EPA HAP Data Explorer

Explore the data contained in the EPA Hazardous Air Pollutants dataset. This dataset contains a summary of daily Hazardous Air Pollutants from 1990 to 2017.

Download the data from Kaggle.



In [1]:

    
import pandas as pd
from pivottablejs import pivot_ui



In [2]:

    
# CHANGE THIS TO THE LOCATION OF THE FILE
csv_file = "/Users/robert.dempsey/Downloads/epa_hap_daily_summary.csv"



In [3]:

    
hap_df = pd.DataFrame.from_csv(csv_file)



In [4]:

    
# Get the row and column counts
rows_cols = hap_df.shape
print("Rows: {}".format(rows_cols[0]))
print("Columns: {}".format(rows_cols[1]))









    



Rows: 8097069
Columns: 28



In [5]:

    
# List the columns and their types
hap_df.dtypes









    Out[5]:





county_code              int64
site_num                 int64
parameter_code           int64
poc                      int64
latitude               float64
longitude              float64
datum                   object
parameter_name          object
sample_duration         object
pollutant_standard     float64
date_local              object
units_of_measure        object
event_type              object
observation_count        int64
observation_percent    float64
arithmetic_mean        float64
first_max_value        float64
first_max_hour           int64
aqi                    float64
method_code              int64
method_name             object
local_site_name         object
address                 object
state_name              object
county_name             object
city_name               object
cbsa_name               object
date_of_last_change     object
dtype: object



In [6]:

    
# Show summary statistics
hap_df.describe()









    Out[6]:







  
    
      
      county_code
      site_num
      parameter_code
      poc
      latitude
      longitude
      pollutant_standard
      observation_count
      observation_percent
      arithmetic_mean
      first_max_value
      first_max_hour
      aqi
      method_code
    
  
  
    
      count
      8.097069e+06
      8.097069e+06
      8.097069e+06
      8.097069e+06
      8.097069e+06
      8.097069e+06
      0.0
      8.097069e+06
      8.097069e+06
      8.097069e+06
      8.097069e+06
      8.097069e+06
      0.0
      8.097069e+06
    
    
      mean
      8.995137e+01
      1.370898e+03
      6.042453e+04
      2.835583e+00
      3.818801e+01
      -9.434529e+01
      NaN
      1.572091e+00
      9.880965e+01
      4.466537e-01
      5.618971e-01
      7.211276e-01
      NaN
      4.101036e+02
    
    
      std
      1.008587e+02
      2.746745e+03
      2.560563e+04
      2.905911e+00
      5.968574e+00
      1.716207e+01
      NaN
      3.434705e+00
      8.590761e+00
      1.247981e+01
      1.306216e+01
      3.240937e+00
      NaN
      3.298626e+02
    
    
      min
      1.000000e+00
      1.000000e+00
      1.210300e+04
      1.000000e+00
      0.000000e+00
      -1.605083e+02
      NaN
      1.000000e+00
      1.000000e+00
      -1.000000e-01
      -1.000000e-01
      0.000000e+00
      NaN
      8.900000e+01
    
    
      25%
      2.700000e+01
      8.000000e+00
      4.380400e+04
      1.000000e+00
      3.394471e+01
      -1.066766e+02
      NaN
      1.000000e+00
      1.000000e+02
      0.000000e+00
      0.000000e+00
      0.000000e+00
      NaN
      1.160000e+02
    
    
      50%
      6.300000e+01
      4.200000e+01
      4.386000e+04
      1.000000e+00
      3.892185e+01
      -9.278027e+01
      NaN
      1.000000e+00
      1.000000e+02
      8.900000e-04
      8.900000e-04
      0.000000e+00
      NaN
      1.750000e+02
    
    
      75%
      1.190000e+02
      1.007000e+03
      8.811200e+04
      5.000000e+00
      4.219438e+01
      -8.096230e+01
      NaN
      1.000000e+00
      1.000000e+02
      4.000000e-02
      4.000000e-02
      0.000000e+00
      NaN
      8.000000e+02
    
    
      max
      8.400000e+02
      9.998000e+03
      8.813600e+04
      9.900000e+01
      6.709195e+01
      0.000000e+00
      NaN
      9.600000e+01
      1.000000e+02
      2.000000e+04
      2.000000e+04
      2.300000e+01
      NaN
      9.230000e+02



In [7]:

    
# View the last five rows of the dataframe
hap_df.tail()









    Out[7]:







  
    
      
      county_code
      site_num
      parameter_code
      poc
      latitude
      longitude
      datum
      parameter_name
      sample_duration
      pollutant_standard
      ...
      aqi
      method_code
      method_name
      local_site_name
      address
      state_name
      county_name
      city_name
      cbsa_name
      date_of_last_change
    
    
      state_code
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      53
      33
      80
      43804
      6
      47.568236
      -122.308628
      WGS84
      Carbon tetrachloride
      24 HOUR
      NaN
      ...
      NaN
      101
      CANISTER SUBAMBIENT PRESSURE - MULTI DETECTOR GC
      SEATTLE - BEACON HILL
      4103 BEACON HILL S
      Washington
      King
      Seattle
      Seattle-Tacoma-Bellevue WA
      2015-03-17
    
    
      6
      19
      8
      45201
      5
      36.781333
      -119.773190
      NAD83
      Benzene
      24 HOUR
      NaN
      ...
      NaN
      171
      6L Pressurized Canister - Precon Saturn GC/MS
      3425 N FIRST ST FRESNO
      3425 N FIRST ST FRESNO
      California
      Fresno
      Fresno
      Fresno CA
      2013-06-11
    
    
      13
      21
      12
      43802
      1
      32.805408
      -83.543521
      WGS84
      Dichloromethane
      24 HOUR
      NaN
      ...
      NaN
      150
      SS 6L- PRESSURIZED CANISTER - CRYOGENIC PRECON...
      Macon SE
      Georgia Forestry Commission 5645 Riggins Mill ...
      Georgia
      Bibb
      Macon
      Macon GA
      2013-06-11
    
    
      53
      47
      12
      88103
      1
      48.387531
      -119.928671
      WGS84
      Arsenic PM2.5 LC
      24 HOUR
      NaN
      ...
      NaN
      800
      IMPROVE Module A with Cyclone Inlet-Teflon Fil...
      NaN
      Pasayten
      Washington
      Okanogan
      Not in a city
      NaN
      2015-07-28
    
    
      33
      15
      14
      88136
      5
      43.075333
      -70.748000
      WGS84
      Nickel PM2.5 LC
      24 HOUR
      NaN
      ...
      NaN
      821
      Andersen RAAS Teflon - Energy Dispersive XRF
      PORTSMOUTH - PEIRCE ISLAND
      PORTSMOUTH PEIRCE ISLAND
      New Hampshire
      Rockingham
      Portsmouth
      Boston-Cambridge-Newton MA-NH
      2015-07-22
    
  

5 rows × 28 columns



In [8]:

    
# Print all columns and values for the last row in the dataset
hap_df.tail(1).to_dict(orient='records')









    Out[8]:





[{'address': 'PORTSMOUTH PEIRCE ISLAND',
  'aqi': nan,
  'arithmetic_mean': 0.0,
  'cbsa_name': 'Boston-Cambridge-Newton MA-NH',
  'city_name': 'Portsmouth',
  'county_code': 15,
  'county_name': 'Rockingham',
  'date_local': '2004-03-13',
  'date_of_last_change': '2015-07-22',
  'datum': 'WGS84',
  'event_type': 'None',
  'first_max_hour': 0,
  'first_max_value': 0.0,
  'latitude': 43.075333,
  'local_site_name': 'PORTSMOUTH - PEIRCE ISLAND',
  'longitude': -70.748,
  'method_code': 821,
  'method_name': 'Andersen RAAS Teflon - Energy Dispersive XRF',
  'observation_count': 1,
  'observation_percent': 100.0,
  'parameter_code': 88136,
  'parameter_name': 'Nickel PM2.5 LC',
  'poc': 5,
  'pollutant_standard': nan,
  'sample_duration': '24 HOUR',
  'site_num': 14,
  'state_name': 'New Hampshire',
  'units_of_measure': 'Micrograms/cubic meter (LC)'}]



In [9]:

    
# Check to see if there are any null values in any of the columns
null_values_present = hap_df.isnull().values.any()

if null_values_present:
    nan_rows = hap_df[hap_df.isnull().T.any().T]
    print("Rows with empty values: {}".format(len(nan_rows)))
    print("Filling empty values")
    hap_df.fillna(0, inplace=True)
    print("Empty values filled")









    



Rows with empty values: 8097069
Filling empty values
Empty values filled



In [10]:

    
# Create a dataframe from the last 60 rows and use PivotTable.js to view the data
hap_pivot_df = hap_df.tail(60)

# Create a column from the timestamp index and create a new numerical index
hap_pivot_df.reset_index(level=0, inplace=True)

# Show the pivot table
pivot_ui(hap_pivot_df)









    Out[10]:



In [ ]:

	county_code	site_num	parameter_code	poc	latitude	longitude	pollutant_standard	observation_count	observation_percent	arithmetic_mean	first_max_value	first_max_hour	aqi	method_code
count	8.097069e+06	8.097069e+06	8.097069e+06	8.097069e+06	8.097069e+06	8.097069e+06	0.0	8.097069e+06	8.097069e+06	8.097069e+06	8.097069e+06	8.097069e+06	0.0	8.097069e+06
mean	8.995137e+01	1.370898e+03	6.042453e+04	2.835583e+00	3.818801e+01	-9.434529e+01	NaN	1.572091e+00	9.880965e+01	4.466537e-01	5.618971e-01	7.211276e-01	NaN	4.101036e+02
std	1.008587e+02	2.746745e+03	2.560563e+04	2.905911e+00	5.968574e+00	1.716207e+01	NaN	3.434705e+00	8.590761e+00	1.247981e+01	1.306216e+01	3.240937e+00	NaN	3.298626e+02
min	1.000000e+00	1.000000e+00	1.210300e+04	1.000000e+00	0.000000e+00	-1.605083e+02	NaN	1.000000e+00	1.000000e+00	-1.000000e-01	-1.000000e-01	0.000000e+00	NaN	8.900000e+01
25%	2.700000e+01	8.000000e+00	4.380400e+04	1.000000e+00	3.394471e+01	-1.066766e+02	NaN	1.000000e+00	1.000000e+02	0.000000e+00	0.000000e+00	0.000000e+00	NaN	1.160000e+02
50%	6.300000e+01	4.200000e+01	4.386000e+04	1.000000e+00	3.892185e+01	-9.278027e+01	NaN	1.000000e+00	1.000000e+02	8.900000e-04	8.900000e-04	0.000000e+00	NaN	1.750000e+02
75%	1.190000e+02	1.007000e+03	8.811200e+04	5.000000e+00	4.219438e+01	-8.096230e+01	NaN	1.000000e+00	1.000000e+02	4.000000e-02	4.000000e-02	0.000000e+00	NaN	8.000000e+02
max	8.400000e+02	9.998000e+03	8.813600e+04	9.900000e+01	6.709195e+01	0.000000e+00	NaN	9.600000e+01	1.000000e+02	2.000000e+04	2.000000e+04	2.300000e+01	NaN	9.230000e+02

	county_code	site_num	parameter_code	poc	latitude	longitude	datum	parameter_name	sample_duration	pollutant_standard	...	aqi	method_code	method_name	local_site_name	address	state_name	county_name	city_name	cbsa_name	date_of_last_change
state_code
53	33	80	43804	6	47.568236	-122.308628	WGS84	Carbon tetrachloride	24 HOUR	NaN	...	NaN	101	CANISTER SUBAMBIENT PRESSURE - MULTI DETECTOR GC	SEATTLE - BEACON HILL	4103 BEACON HILL S	Washington	King	Seattle	Seattle-Tacoma-Bellevue WA	2015-03-17
6	19	8	45201	5	36.781333	-119.773190	NAD83	Benzene	24 HOUR	NaN	...	NaN	171	6L Pressurized Canister - Precon Saturn GC/MS	3425 N FIRST ST FRESNO	3425 N FIRST ST FRESNO	California	Fresno	Fresno	Fresno CA	2013-06-11
13	21	12	43802	1	32.805408	-83.543521	WGS84	Dichloromethane	24 HOUR	NaN	...	NaN	150	SS 6L- PRESSURIZED CANISTER - CRYOGENIC PRECON...	Macon SE	Georgia Forestry Commission 5645 Riggins Mill ...	Georgia	Bibb	Macon	Macon GA	2013-06-11
53	47	12	88103	1	48.387531	-119.928671	WGS84	Arsenic PM2.5 LC	24 HOUR	NaN	...	NaN	800	IMPROVE Module A with Cyclone Inlet-Teflon Fil...	NaN	Pasayten	Washington	Okanogan	Not in a city	NaN	2015-07-28
33	15	14	88136	5	43.075333	-70.748000	WGS84	Nickel PM2.5 LC	24 HOUR	NaN	...	NaN	821	Andersen RAAS Teflon - Energy Dispersive XRF	PORTSMOUTH - PEIRCE ISLAND	PORTSMOUTH PEIRCE ISLAND	New Hampshire	Rockingham	Portsmouth	Boston-Cambridge-Newton MA-NH	2015-07-22